<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="Minigpt-4">
  <meta name="keywords" content="GPT-4, open-source, vision-language">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Minigpt-4</title>

  <!-- Global site tag (gtag.js) - Google Analytics
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script> -->

  <meta name="google-site-verification" content="6lbYN1vX7A4sD8SrVniq84UEKyEUSBgxeP7d3FjuuK0" />

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <!-- <link rel="icon" href="./static/images/icon.png"> -->
  <link rel="stylesheet" href="./static/css/index.css">

  <!-- NOTE(review): "path/to/favicon.ico" looks like an unfilled template placeholder;
       point it at a real icon file or remove the link. -->
  <link rel="shortcut icon" href="path/to/favicon.ico" type="image/x-icon">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>

  <!-- Page-local rules for the screenshot gallery (#main and its .box / .pic cells).
       Kept inside <head>: a <style> element between </head> and <body> is invalid. -->
  <style>
    /* Fixed-width wrapper around the gallery cells. */
    #main {
      position: relative;
      width: 1200px;
    }

    /* One gallery cell; cells are floated so they flow left to right. */
    .box {
      float: left;
      padding: 15px 0 0 15px;
    }

    /* Bordered white card that frames a single screenshot. */
    .pic {
      width: 500px;
      padding: 10px;
      border: 1px solid #ccc;
      border-radius: 5px;
      background-color: #fff;
    }

    /* Scale every screenshot to the card width. */
    .pic img {
      width: 500px;
    }
  </style>
</head>



  <body>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">MiniGPT-4:</h1>
          <h2 class="title is-2 publication-title">Enhancing Vision-language Understanding with Advanced Large Language Models</h2>
          <!-- Author list. Each closing </a> is followed directly by its comma so that no
               linked (underlined) space is rendered between a name and the separator. -->
          <div class="is-size-5">
            <span class="author-block">
              <a href="https://tsutikgiau.github.io/" style="color:#008AD7;font-weight:normal;">Deyao Zhu<sup>*</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://junchen14.github.io/" style="color:#008AD7;font-weight:normal;">Jun Chen<sup>*</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://xiaoqian-shen.github.io/" style="color:#008AD7;font-weight:normal;">Xiaoqian Shen</a>,
            </span>
            <span class="author-block">
              <a href="https://lx709.github.io/" style="color:#008AD7;font-weight:normal;">Xiang Li</a>,
            </span>
            <span class="author-block">
              <a href="https://www.mohamed-elhoseiny.com/" style="color:#008AD7;font-weight:normal;">Mohamed Elhoseiny</a>
            </span>
          </div>

          <br>
          <!-- Affiliation line. The triangle marker is written as a complete character
               reference (&#x25B6;); the original omitted the terminating semicolon. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><b style="color:#008AD7; font-weight:normal">&#x25B6; </b> King Abdullah University of Science and Technology </span>
            <!-- <span class="author-block"><b style="color:#F2A900; font-weight:normal">&#x25B6 </b>UCLA; </span> -->
            <!-- <span class="author-block"><b style="color:#00A4EF; font-weight:normal">&#x25B6 </b>Microsoft Research, Redmond; </span> -->
            <!-- <span class="author-block"><b style="color:#008AD7; font-weight:normal">&#x25B6 </b>Microsoft Cloud & AI </span> -->
          </div>

          <!-- Footnote explaining the asterisk used in the author list. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>*</sup>Equal Contribution </span>
            <!-- <span class="author-block"><sup>&#x2628;</sup>Equal Advisory Contribution, </span> -->
            <!-- <span class="author-block"><sup>&#x2691;</sup>Project Lead </span> -->
          </div>

          <br>
         <!--  <div class="is-size-5 publication-authors">
            <span class="author-block"><b style="color:#e08ba0; font-weight:normal"> <b>In CVPR2023</b> </b></span>
          </div> -->


          <div class="column has-text-centered">
            <!-- Row of pill buttons linking to the paper, code, demos, video, dataset and model.
                 Every link opens in a new tab, so each carries rel="noopener" to keep the opened
                 page from reaching back into this one through window.opener. -->
            <div class="publication-links">
              <span class="link-block">
                <a href="https://arxiv.org/abs/2304.10592" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://github.com/Vision-CAIR/MiniGPT-4" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://huggingface.co/spaces/Vision-CAIR/minigpt4" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    🤗
                  </span>
                  <span>Space</span>
                </a>
              </span>

              <script>
                // After the page has loaded, point the "Demo" button (#randomLink) at one of
                // the Gradio demo URLs below, picked at random on every page load.
                window.addEventListener('load', function () {
                  const demoUrls = [
                    'https://4427e15f09c65f1071.gradio.live',
                    'https://3228d8146e5c39b4be.gradio.live',
                    'https://60ac677ed79d2da927.gradio.live',
                    'https://c9cd51f7cae3c9fec1.gradio.live',
                    'https://f2283aee1e9ab52ea9.gradio.live',
                    'https://ecf463e1131cfb7099.gradio.live',
                    'https://d6a81e24b072749a4d.gradio.live',
                    'https://4da94f3035b0462a03.gradio.live',
                  ];
                  const demoLink = document.getElementById('randomLink');
                  // Guard so a renamed or removed button does not raise a TypeError on load.
                  if (!demoLink) {
                    return;
                  }
                  demoLink.href = demoUrls[Math.floor(Math.random() * demoUrls.length)];
                });
              </script>

              <span class="link-block">
                <!-- The static href is a fallback used before the script above runs or when
                     JavaScript is disabled; it replaces the former dead "#" target. -->
                <a id="randomLink" href="https://huggingface.co/spaces/Vision-CAIR/minigpt4" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-play"></i>
                  </span>
                  <span>Demo</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://youtu.be/__tftoxpBAw" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-database"></i>
                  </span>
                  <span>Dataset</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://huggingface.co/Vision-CAIR/MiniGPT-4" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-laugh"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>

            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
    
<!-- <section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <div class="content has-text-justified">
          <p>
            Thanks for your interest in our work. Currently, the number of users has exceeded our expectations. We provide <strong><font color="#008AD7">alternative demo links</font></strong> here: 
            <a href="https://b2517615b965687635.gradio.live" target="_blank">Demo1</a>
            <a href="https://c8de8ff74b6a6c6a9b.gradio.live" target="_blank">Demo2</a>
            <a href="https://90bc0bac96e6457e8f.gradio.live" target="_blank">Demo3</a>
            <a href="https://cd772059965a71f9e6.gradio.live" target="_blank">Demo4</a>
            <a href="https://48da7e23bcadec7551.gradio.live" target="_blank">Demo5</a>
            <a href="https://687d119023cd37e5fb.gradio.live" target="_blank">Demo6</a>
            <a href="https://0810e8582bcad31944.gradio.live" target="_blank">Demo7</a>
            <a href="https://31c7cdb7e3594e851e.gradio.live" target="_blank">Demo8</a>
       
            <strong><font>News</font></strong>: We now provide a pretrained MiniGPT-4 aligned with <strong><font color="#008AD7">Vicuna-7B</font></strong>! The demo GPU memory consumption now can be <strong><font color="#008AD7">as low as 12GB</font></strong>.
            <br>
            </p>
        </div>
      </div>
    </div>
</section>
 -->
<!-- Embedded live demo. Once the page has finished loading, the script sets the iframe's
     src to one of the Gradio demo URLs below, picked at random on every page load. -->
<script>
  window.addEventListener('load', function () {
    const demoUrls = [
      'https://4427e15f09c65f1071.gradio.live',
      'https://3228d8146e5c39b4be.gradio.live',
      'https://60ac677ed79d2da927.gradio.live',
      'https://c9cd51f7cae3c9fec1.gradio.live',
      'https://f2283aee1e9ab52ea9.gradio.live',
      'https://ecf463e1131cfb7099.gradio.live',
      'https://d6a81e24b072749a4d.gradio.live',
      'https://4da94f3035b0462a03.gradio.live',
    ];
    const demoFrame = document.getElementById('gradio');
    // Guard so a renamed or removed iframe does not raise a TypeError on load.
    if (!demoFrame) {
      return;
    }
    demoFrame.setAttribute('src', demoUrls[Math.floor(Math.random() * demoUrls.length)]);
  });
</script>

<!-- The title gives assistive technology a name for the frame. The former <p> child was
     dropped: markup inside an iframe is parsed as plain text and never displayed. -->
<iframe id="gradio" title="MiniGPT-4 interactive Gradio demo" width="100%" height="900"></iframe>
  
<!-- <link rel="stylesheet" href="js/ft-carousel.css" />
<script src="js/jquery.min.js"></script>
<script src="js/ft-carousel.min.js"></script>
<script type="text/javascript">
  $("#carousel_1").FtCarousel();

  $("#carousel_2").FtCarousel({
    index: 1,
    auto: false
  });

  $("#carousel_3").FtCarousel({
    index: 0,
    auto: true,
    time: 3000,
    indicators: false,
    buttons: true
  });
</script> -->

<!-- <section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="example">
      <div class="ft-carousel" id="carousel_1">
        <ul class="carousel-inner"> -->
          <!-- <li class="carousel-item"><img src="demos/wop_2.png" /></li>
          <li class="carousel-item"><img src="demos/cook_1.png" /></li>
          <li class="carousel-item"><img src="demos/fix_1.png" /></li>
          <li class="carousel-item"><img src="demos/rhyme_1.png" /></li> -->
       <!--    <li class="carousel-item"><img src="img/a1.jpg" /></li>
      <li class="carousel-item"><img src="img/a2.jpg" /></li>
      <li class="carousel-item"><img src="img/a3.jpg" /></li>
      <li class="carousel-item"><img src="img/a4.jpg" /></li>
      <li class="carousel-item"><img src="img/a5.jpg" /></li>
      <li class="carousel-item"><img src="img/a6.jpg" /></li>
        </ul>
      </div>
    </div>
  </div>
</section> -->

<!-- 
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
       <div class="item">
        <img src="demos/wop_2.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
        </h2>
      </div>
      <div class="item">
        <img src="demos/cook_1.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
        </h2>
      </div>
      <div class="item">
        <img src="demos/fix_1.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
       </h2>
     </div>
     <div class="item">
      <img src="demos/rhyme_1.png" alt="MY ALT TEXT"/>
      <h2 class="subtitle has-text-centered">
      </h2>
    </div>
  </div>
</div>
</div>
</section>
 -->

<!-- Stylesheet and script from the js/ directory; presumably the assets for the SimSwiper
     carousel, whose markup is currently commented out. -->
<link rel="stylesheet" href="js/simple_style.css">
<script src="js/simple_swiper.js"></script>


<!-- <div class="app">
  <div id="swiper-demo" class="simple-swiper-container">
    <a id="prev" class="btn btn-prev"></a>
    <a id="next" class="btn btn-next"></a>
    <div class="pagination"></div>
  </div>
</div>
<p id="index"></p>

<script type="text/javascript">
  new SimSwiper("#swiper-demo", {
    autoplay: 4000,
    duration: 300,
    easing: 'ease',
    button: {
      prev: "#prev", // previous / next buttons
      next: "#next"
    },
    pagination: {
      el: '.pagination',
      click: true // whether the pagination dots are clickable
    },
    // carousel slide data
    data: [{
      index: 0,
      href: '#',
      src: 'demos/wop_2.png'
    }, {
      index: 1,
      href: '#',
      src: 'demos/cook_1.png'
    }, {
      index: 2,
      href: '#',
      src: 'demos/fix_1.png'
    }, {
      index: 3,
      href: '#',
      src: 'demos/rhyme_1.png'
    }]
  });
</script> -->


<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <!-- Abstract: centred heading with the justified paper abstract below it. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            The recent GPT-4 has demonstrated extraordinary multi-modal abilities, such as directly generating websites from handwritten text and identifying humorous elements within images. These features are rarely observed in previous vision-language models. We believe the primary reason for GPT-4's advanced multi-modal generation capabilities lies in the utilization of a more advanced large language model (LLM). To examine this phenomenon, we present MiniGPT-4, which aligns a frozen visual encoder with a frozen LLM, Vicuna, using just one projection layer. Our findings reveal that MiniGPT-4 possesses many capabilities similar to those exhibited by GPT-4 like detailed image description generation and website creation from hand-written drafts. Furthermore, we also observe other emerging capabilities in MiniGPT-4, including writing stories and poems inspired by given images, providing solutions to problems shown in images, teaching users how to cook based on food photos, etc. In our experiment, we found that only performing the pretraining on raw image-text pairs could produce unnatural language outputs that lack coherency including repetition and fragmented sentences. To address this problem, we curate a high-quality, well-aligned dataset in the second stage to finetune our model using a conversational template. This step proved crucial for augmenting the model's generation reliability and overall usability. Notably, our model is highly computationally efficient, as we only train a projection layer utilizing approximately 5 million aligned image-text pairs.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
    <br>
    <br>
    <!-- Paper video. -->
    <!-- <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Video</h2>
        <div class="publication-video">
          <iframe width=“560” height=“315" src=“https://www.youtube.com/embed/__tftoxpBAw” title=“YouTube video player” frameborder=“0” allow=“accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share” allowfullscreen></iframe>
        </div>
      </div>
    </div> -->

    
<!-- <section class="hero is-small is-light">
    <div class="hero-body"> -->
        <!-- Video presentation: section heading followed by the embedded YouTube player. -->
        <div class="container">
          <h2 class="title has-text-centered">Video Presentation</h2>
          <div class="columns is-centered has-text-centered">
            <div class="column is-four-fifths">
              <div class="publication-video">
                <iframe
                  src="https://www.youtube.com/embed/__tftoxpBAw"
                  title="YouTube video player"
                  width="560"
                  height="315"
                  frameborder="0"
                  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
                  allowfullscreen></iframe>
              </div>
            </div>
          </div>
        </div>
<!--     </div>
</section> -->


    <!--/ Demo. -->
    <!-- <br>
    <br>

    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Demo</h2>
      </div>
    </div>

    <div class="column is-full-width">
      <div class="columns is-centered">
        <img id="teaser" width="90%" src="images/demo6_AdobeExpress.gif">
      </div>
      <div class="columns is-centered">
      <h1>
        <p style="font-family:Times New Roman"><b>X-GPT: Connecting generalist X-Decoder with GPT-3</b>
      </h1>                 
      </div>
    </div>

    <br>

    <div class="column is-full-width">
      <div class="columns is-centered">
        <img id="teaser" width="90%" src="images/inpaint.gif">
      </div>
      <div class="columns is-centered">
      <h1>
        <p style="font-family:Times New Roman"><b>Instruct-X-Decoder: Object-centric instructional image editing</b>
      </h1>                 
      </div>
    </div> -->

    <!--/ Paper video. -->
    <br>
    <br>
    <!-- Paper Model. -->
    <!-- Model overview: one-paragraph summary followed by the architecture figure.
         NOTE(review): "is-six-fifths" is not one of Bulma's column sizes (they stop at
         is-four-fifths / is-full); kept unchanged in case index.css defines it. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <h2 class="title is-3">Model</h2>
        <div class="content has-text-justified">
          <!-- The summary used to end in ":" and introduce a list whose items were all
               commented out; the dangling colon and the empty list have been removed. -->
          <p>
            <b>MiniGPT-4 consists of a vision encoder with a pretrained ViT and Q-Former, a single linear projection layer, and an advanced Vicuna large language model. MiniGPT-4 only requires training the linear layer to align the visual features with the Vicuna.</b>
          </p>
        </div>
        <!-- alt repeats the page's own description of the architecture for non-visual users. -->
        <img id="model" width="80%" src="images/overview.png"
             alt="Architecture of MiniGPT-4: a vision encoder with a pretrained ViT and Q-Former, a single linear projection layer, and the Vicuna large language model">
        <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman"><b>The architecture of MiniGPT-4.</b></p>
        </h3>
        <br>
        <br>

      </div>
    </div>
    <br>
    <br>    
    <!--/ Paper Model. -->
  </div>
</section>


<!-- Citation block. The entry starts immediately after <code> and contains no HTML comment,
     so the rendered snippet has no blank or whitespace-only lines and copies cleanly.
     The cite key and the arXiv id (2304.10592, as linked from the Paper button) belong to
     MiniGPT-4; the previous key was left over from a different paper's page. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{zhu2023minigpt,
  author      = {Deyao Zhu and Jun Chen and Xiaoqian Shen and Xiang Li and Mohamed Elhoseiny},
  title       = {MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models},
  journal     = {arXiv preprint arXiv:2304.10592},
  year        = {2023},
}
</code></pre>
  </div>
</section>


<!-- Template credit and licence notice. The licence link uses https rather than http. -->
<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop content">
    <h2 class="title">Acknowledgement</h2>
    <p>
      This website is adapted from
      <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>, licensed under a
      <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
      Commons Attribution-ShareAlike 4.0 International License</a>.
    </p>
  </div>
</section>

<!-- "Results" heading; the demo screenshots themselves live in a later section.
     The markup here was unbalanced: one surplus </div> and one empty
     <div class="container is-max-desktop"> that was never closed. Both are removed; the
     remaining elements are unchanged, so the rendered heading should look the same. -->
<section class="section">
  <!-- Results. -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-six-fifths">
      <h2 class="title is-3">Results</h2>
    </div>
  </div>
  <!--/ Results. -->
</section>


<!-- Scripts loaded from the js/ directory: presumably the Underscore.js library plus a page
     script. NOTE(review): js/index.js is a different path from ./static/js/index.js loaded
     in <head>; it presumably lays out the #main screenshot gallery that follows - confirm. -->
<script src="js/Underscore-min.js"></script>
<script src="js/index.js"></script>


<!-- Gallery of demo screenshots. Each .box cell is floated left and wraps a .pic card that
     scales its image to 500px (see the page's inline #main / .box / .pic style rules).
     NOTE(review): every image has alt="", which marks it as decorative; these look like
     content screenshots, so descriptive alt text is probably wanted - confirm against the
     actual images. -->
<section class="section">
<div id="main">
  <div class="box"><div class="pic"><img src="demos/ad_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/web_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/ad_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/cook_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/describe_2.png" alt=""></div></div>


  <div class="box"><div class="pic"><img src="demos/fact_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/fact_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/fix_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/fun_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/fun_2.png" alt=""></div></div>

  <div class="box"><div class="pic"><img src="demos/logo_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/op_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/op_2.png" alt=""></div></div>

  <div class="box"><div class="pic"><img src="demos/wop_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/people_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/rhyme_2.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/story_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/story_2.png" alt=""></div></div>
  
  <div class="box"><div class="pic"><img src="demos/describe_1.png" alt=""></div></div>
  <div class="box"><div class="pic"><img src="demos/people_1.png" alt=""></div></div>


</div>

</section>



</body>

</html>
